YouTube Source: https://youtu.be/eQU8Zd1B9tM

Import and Clean Amazon Product Reviews

# Read the review CSV, convert the column names with janitor::clean_names(),
# and drop the id, profile_name and images columns.
taras_caramels <- read_csv("./taras_caramels.csv") %>%
  janitor::clean_names() %>%
  select(-c(id, profile_name, images))
## Rows: 233 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): id, profileName, text, date, title, images
## dbl (2): rating, helpful
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Lower-case the `text` column and parse the `date` column into a Date
# column named `Ymd`.
# NOTE(review): `text` is not kept by the select() below (the summary shows
# only Ymd, review, rating and helpful), so the lower-casing has no downstream
# effect -- confirm whether `text` was meant to be the analysed column.
taras_caramels <- taras_caramels %>%
  mutate(
    text = tolower(text),
    Ymd = lubridate::mdy(date)
  ) %>%
  # Keep the parsed date plus `title` through `helpful`; the title becomes the
  # `review` column used by the rest of the script.
  select(Ymd, title:helpful) %>%
  rename(review = title)
summary(taras_caramels)
##       Ymd                review              rating         helpful        
##  Min.   :2018-05-16   Length:233         Min.   :1.000   Min.   :  0.0000  
##  1st Qu.:2019-09-04   Class :character   1st Qu.:3.000   1st Qu.:  0.0000  
##  Median :2020-09-28   Mode  :character   Median :5.000   Median :  0.0000  
##  Mean   :2020-06-27                      Mean   :3.811   Mean   :  0.9828  
##  3rd Qu.:2021-06-01                      3rd Qu.:5.000   3rd Qu.:  0.0000  
##  Max.   :2021-11-19                      Max.   :5.000   Max.   :118.0000
# Distribution of the `rating` column across all reviews.
ggplot(taras_caramels, aes(x = rating)) +
  geom_histogram(bins = 9) +
  labs(title = "Number of Reviews by Rating")

# Count reviews per date, draw them as columns, then hand the plot to
# ggplotly().
p1 <- taras_caramels %>%
  count(Ymd) %>%
  ggplot(aes(x = Ymd, y = n)) +
  geom_col() +
  labs(title = "Number of Reviews per Day", y = "Number of Reviews")
ggplotly(p1)

Analyze Words Using the “sentimentr” Package

Commands from Video (see above)

# Score the `review` column with sentiment() and show the first 25 rows.
taras_caramels$review %>%
  sentiment() %>%
  head(n = 25)
##     element_id sentence_id word_count   sentiment
##  1:          1           1          4 -0.25000000
##  2:          1           2          8  0.35355339
##  3:          1           3          5  0.44721360
##  4:          2           1          5  0.22360680
##  5:          3           1         11  0.00000000
##  6:          4           1         10  0.04743416
##  7:          5           1          9  0.61683333
##  8:          6           1          2  0.53033009
##  9:          7           1          3  0.28867513
## 10:          8           1          4  0.30000000
## 11:          9           1          1  1.00000000
## 12:         10           1          5 -0.50311529
## 13:         11           1          6  0.36742346
## 14:         12           1          1 -0.50000000
## 15:         12           2          3 -0.57735027
## 16:         13           1          1  0.00000000
## 17:         13           2          3  0.00000000
## 18:         14           1          1  0.50000000
## 19:         15           1          3  0.43301270
## 20:         16           1          4 -0.50000000
## 21:         17           1          1  0.50000000
## 22:         17           2          1  0.00000000
## 23:         17           3          1  0.00000000
## 24:         18           1          6  0.00000000
## 25:         19           1          6 -0.30618622
##     element_id sentence_id word_count   sentiment
# Aggregate sentiment per element of the `review` column.
taras_caramels$review %>%
  sentiment_by()
##      element_id word_count       sd ave_sentiment
##   1:          1         17 0.378408    0.18358900
##   2:          2          5       NA    0.22360680
##   3:          3         11       NA    0.00000000
##   4:          4         10       NA    0.04743416
##   5:          5          9       NA    0.61683333
##  ---                                             
## 229:        229         14       NA    0.06681531
## 230:        230          1       NA    0.50000000
## 231:        231          6       NA   -0.20412415
## 232:        232          3       NA    0.08660254
## 233:        233          6       NA    0.24494897

Sentence Structure

# Split the `review` column into sentences, then score each sentence.
taras_sentence <- sentiment(
  taras_caramels %>%
    select(review) %>%
    get_sentences()
)

# Density curve of the sentence-level `sentiment` scores.
ggplot(taras_sentence, aes(x = sentiment)) +
  geom_density() +
  labs(
    title = "Density Plot of Rating Sentiment",
    x = " <-- Negative(-) Bad  but Plus(+) Good --> "
  )

# Histogram of the sentence-level `sentiment` scores (default binning).
ggplot(taras_sentence, aes(x = sentiment)) +
  geom_histogram() +
  labs(
    title = "Histogram of Rating Sentiment",
    x = " <-- Negative(-) Bad  but Plus(+) Good --> "
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Label each sentence by the sign of its sentiment score.
# NOTE: a score of exactly 0 is not > 0, so it lands in the "Negative" bucket.
taras_sentence <- taras_sentence %>%
  mutate(plus_minus = if_else(sentiment > 0, "Positive", "Negative"))

# Bar chart of how many sentences fall into each bucket.
taras_sentence %>%
  count(plus_minus, sort = TRUE) %>%
  ggplot(aes(x = plus_minus, y = n)) +
  geom_col() +
  labs(
    title = "Negative vs Positive Ratings",
    y = "Number of Ratings"
  )

Begin TidyText Package Analysis

# Tokenise the `review` column into a `word` column.
taras_tokens <- taras_caramels %>%
  select(review) %>%
  unnest_tokens(word, review)

# Most frequent tokens (stop words still included), drawn as horizontal bars
# ordered by count.
taras_tokens %>%
  count(word, sort = TRUE) %>%
  top_n(25) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip()
## Selecting by n

# Load the `stop_words` data set and drop every token that appears in it.
data(stop_words)
taras_clean <- anti_join(taras_tokens, stop_words)
## Joining, by = "word"
# Top 25 words after stop-word removal. `x` is mapped to the words, so it
# carries the word label and `y` carries the count label; coord_flip() only
# swaps where the two axes are drawn.
taras_clean %>%
  count(word, sort = TRUE) %>%
  top_n(25) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Count of Top 25 Words",
    x = "Token Words",
    y = "Word Counts"
  )
## Selecting by n

Construct a Word Cloud

# Count each remaining word and draw a word cloud capped at 200 words.
taras_clean %>%
  count(word) %>%
  with(
    wordcloud(
      words = word,
      freq = n,
      max.words = 200
    )
  )